# For parsing the csv
import csv
import urllib2
import StringIO
import math
# For actual computations
import random as rand
import pandas as pd
In 2010 Google released a page listing the top 1000 websites on the internet: https://web.archive.org/web/20130102235318/http://www.google.com/adplanner/static/top1000
The page has since been taken down, but you can still find the data mirrored.
Let's take a look at it!
# Local import: contextlib.closing is only needed for the download below.
from contextlib import closing

# Mirrored CSV copy of the Google top-1000 list (file name: googletop1000april2010.csv).
websites_url = 'https://raw.githubusercontent.com/ledeprogram/courses/master/platforms/anonymization/googletop1000april2010.csv'

# urllib2 responses are not context managers on Python 2, so wrap the response
# in closing() to guarantee the HTTP connection is released once pandas has
# parsed the CSV (previously the response was left open). The name
# websites_response stays bound afterwards, but refers to a closed response.
with closing(urllib2.urlopen(websites_url)) as websites_response:
    websites = pd.read_csv(websites_response)

# Bare expression kept from the original cell; the output that follows it in
# this file is the rendered DataFrame.
websites
| | Rank | Unique Visitors (users) | Page Views | Reach | Site | Category | Has Advertising |
---|---|---|---|---|---|---|---|
0 | 1 | 540000000 | 570000000000 | 35.2% | facebook.com | Social Networks | Yes |
1 | 2 | 490000000 | 70000000000 | 31.8% | yahoo.com | Web Portals | Yes |
2 | 3 | 370000000 | 39000000000 | 24.1% | live.com | Search Engines | Yes |
3 | 4 | 310000000 | 7900000000 | 20% | wikipedia.org | Dictionaries & Encyclopedias | No |
4 | 5 | 280000000 | 11000000000 | 18.1% | msn.com | Web Portals | Yes |
5 | 6 | 230000000 | 3300000000 | 14.8% | microsoft.com | Software | Yes |
6 | 7 | 230000000 | 4400000000 | 14.7% | blogspot.com | Blogging Resources & Services | Yes |
7 | 8 | 230000000 | 27000000000 | 15% | baidu.com | Web Portals | Yes |
8 | 9 | 170000000 | 25000000000 | 11.1% | qq.com | Email & Messaging | Yes |
9 | 10 | 140000000 | 2100000000 | 9.2% | mozilla.com | Internet Clients & Browsers | No |
10 | 11 | 130000000 | 3600000000 | 8.4% | sina.com.cn | Web Portals | Yes |
11 | 12 | 120000000 | 1200000000 | 7.7% | wordpress.com | Blogging Resources & Services | Yes |
12 | 13 | 110000000 | 2700000000 | 7.% | bing.com | Search Engines | Yes |
13 | 14 | 110000000 | 1000000000 | 6.9% | adobe.com | Programming | Yes |
14 | 15 | 98000000 | 2700000000 | 6.3% | 163.com | Web Portals | Yes |
15 | 16 | 98000000 | 10000000000 | 6.3% | taobao.com | Shopping | No |
16 | 17 | 97000000 | 1400000000 | 6.3% | soso.com | Entertainment | No |
17 | 18 | 96000000 | 5400000000 | 6.2% | twitter.com | Email & Messaging | No |
18 | 19 | 89000000 | 1700000000 | 5.8% | youku.com | Video Clips & Movie Downloads | Yes |
19 | 20 | 88000000 | 1700000000 | 5.7% | ask.com | Search Engines | Yes |
20 | 21 | 82000000 | 1900000000 | 5.3% | sohu.com | Web Portals | Yes |
21 | 22 | 74000000 | 3300000000 | 4.8% | amazon.com | Shopping | Yes |
22 | 23 | 74000000 | 490000000 | 4.8% | windows.com | Windows | No |
23 | 24 | 74000000 | 9400000000 | 4.8% | ebay.com | Auctions | Yes |
24 | 25 | 72000000 | 27000000000 | 4.7% | yahoo.co.jp | Web Portals | Yes |
25 | 26 | 72000000 | 27000000000 | 4.7% | myspace.com | Social Networks | Yes |
26 | 27 | 72000000 | 960000000 | 4.7% | apple.com | Mac | Yes |
27 | 28 | 66000000 | 1100000000 | 4.3% | tudou.com | Photo & Video Sharing | No |
28 | 29 | 60000000 | 2000000000 | 3.9% | conduit.com | Advertising & Marketing | No |
29 | 30 | 60000000 | 1100000000 | 3.9% | hotmail.com | Email & Messaging | Yes |
30 | 31 | 55000000 | 1800000000 | 3.6% | flickr.com | Photo & Video Sharing | Yes |
31 | 32 | 55000000 | 1100000000 | 3.6% | photobucket.com | Photo & Video Sharing | Yes |
32 | 33 | 55000000 | 590000000 | 3.6% | tianya.cn | Online Communities | Yes |
33 | 34 | 55000000 | 710000000 | 3.6% | about.com | How-To & Expert Content | Yes |
34 | 35 | 55000000 | 490000000 | 3.6% | cnet.com | Technology News | Yes |
35 | 36 | 50000000 | 1400000000 | 3.3% | hao123.com | Online Directories | No |
36 | 37 | 50000000 | 270000000 | 3.2% | iefxz.com | NaN | No |
37 | 38 | 50000000 | 870000000 | 3.2% | xunlei.com | TV Programs | No |
38 | 39 | 49000000 | 1900000000 | 3.2% | paypal.com | Merchant Services & Payment Systems | Yes |
39 | 40 | 46000000 | 800000000 | 3% | rapidshare.com | File Sharing & Hosting | No |
40 | 41 | 46000000 | 3000000000 | 3% | go.com | Web Portals | Yes |
41 | 42 | 45000000 | 2400000000 | 2.9% | fc2.com | Blogging Resources & Services | Yes |
42 | 43 | 45000000 | 2500000000 | 2.9% | bbc.co.uk | News & Current Events | Yes |
43 | 44 | 45000000 | 1400000000 | 2.9% | imdb.com | Movies | Yes |
44 | 45 | 45000000 | 5300000000 | 2.9% | orkut.com | Social Networks | Yes |
45 | 46 | 45000000 | 540000000 | 2.9% | sogou.com | Web Portals | No |
46 | 47 | 42000000 | 450000000 | 2.7% | 56.com | Multimedia Content | No |
47 | 48 | 42000000 | 4400000000 | 2.7% | aol.com | Web Portals | Yes |
48 | 49 | 42000000 | 14000000000 | 2.7% | craigslist.org | Classifieds | No |
49 | 50 | 41000000 | 4000000000 | 2.6% | rakuten.co.jp | Shopping Portals & Search Engines | Yes |
50 | 51 | 41000000 | 310000000 | 2.7% | imageshack.us | File Sharing & Hosting | Yes |
51 | 52 | 41000000 | 410000000 | 2.7% | ku6.com | Multimedia Content | Yes |
52 | 53 | 41000000 | 1700000000 | 2.7% | blogger.com | Blogging Resources & Services | Yes |
53 | 54 | 41000000 | 810000000 | 2.6% | goo.ne.jp | Web Services | Yes |
54 | 55 | 41000000 | 860000000 | 2.7% | ifeng.com | News & Current Events | Yes |
55 | 56 | 38000000 | 1700000000 | 2.5% | linkedin.com | Social Networks | Yes |
56 | 57 | 38000000 | 7000000000 | 2.4% | yandex.ru | Search Engines | Yes |
57 | 58 | 37000000 | 10000000000 | 2.4% | mail.ru | Email & Messaging | Yes |
58 | 59 | 35000000 | 280000000 | 2.2% | partypoker.com | Cards & Casino Games | No |
59 | 60 | 34000000 | 880000000 | 2.2% | megaupload.com | File Sharing & Hosting | No |
... | ... | ... | ... | ... | ... | ... |
1001 rows × 7 columns